In [1]:
import numpy as np
import pandas as pd

## Plotly plotting support
# import plotly.plotly as py

import plotly.offline as py
import plotly.figure_factory as ff
py.init_notebook_mode()

import cufflinks as cf
cf.go_offline() # required to use plotly offline (no account required).

import plotly.graph_objs as go
In [7]:
np.random.seed(42)

n = 75      # Number of records 

noise = 3.5  # Noise in observations (we wouldn't know this in real life)
m = 1.5      # The true slope (we wouldn't know this in real life) 
b = 10.0      # The true intercept (we wouldn't know this in real life)

# Make the data --------------------------
X = np.random.rand(n) * 20. - 10.
X.sort()
# The Y values are created using the secret model 
#      (We wouldn't have this in real-life either)
Y = m * X + b * np.sin(X) + np.random.randn(n) * noise 
Y[20] = 7
Y[23] = -18
Y[30] = 20
Y[55] = -18
Y += 20
# Y[40] = -20
data = pd.DataFrame(dict(X = X, Y = Y))
del X, Y

data.to_csv("toy_training_data.csv", index=False)

raw_data = go.Scatter(name = "Data", x = data['X'], y = data['Y'], mode = 'markers')
py.iplot([raw_data])
In [8]:
np.random.seed(37)

n = 50      # Number of records 

# Make the data --------------------------
X = np.random.rand(n) * 20. - 10.
X.sort()
# The Y values are created using the secret model 
#      (We wouldn't have this in real-life either)
Y = m * X + b * np.sin(X) + np.random.randn(n) * noise
Y[10] = 7
Y[23] = -7
Y[1] = -18

Y += 20 
test_data = pd.DataFrame(dict(X = X, Y = Y))
del X, Y

test_data.to_csv("toy_test_data.csv", index=False)
In [3]:
np.random.seed(42)

flavor_prices = {
    "Vanilla": 0.75,
    "Chocolate": 0.8,
    "Strawberry": 0.5
}

topping_prices = {
    "Sprinkles": 0.3,
    "Fruit": 1.0,
    "Chocolate": 0.5,
    "None": 0.0
}

n = 200
weights = np.random.rand(n) * 4 + 1.
flavors = np.random.choice(list(flavor_prices.keys()), n)
toppings = np.random.choice(list(topping_prices.keys()), n)
price = np.array([ "%.2f" % (w * flavor_prices[f] + topping_prices[t])
    for (w,f,t) in zip(weights, flavors, toppings)]).astype('float')

icecream = pd.DataFrame({
    "mass": np.round(weights,1),
    "flavor": flavors,
    "topping": toppings,
    "price": price
}, columns=["flavor", "topping", "mass", "price"])

icecream.iloc[0:150,:].to_csv("icecream_train.csv", index=False)
icecream.iloc[150:,:].to_csv("icecream_test.csv", index=False)
In [122]:
icecream.head()
Out[122]:
flavor topping weight price
0 Chocolate Chocolate 3.6 3.35
1 Chocolate Sprinkles 5.0 4.27
2 Chocolate None 3.3 2.68
3 Vanilla Fruit 3.7 3.74
4 Vanilla Chocolate 2.2 2.12
In [123]:
d = pd.get_dummies(df)
In [124]:
from sklearn.feature_extraction import DictVectorizer

flavor_enc = DictVectorizer()
flavor_enc.fit(icecream[["flavor"]].to_dict(orient='records'))
onehot_flavor = flavor_enc.transform(icecream[["flavor"]].to_dict(orient='records'))
In [125]:
topping_enc = DictVectorizer()
topping_enc.fit(icecream[["topping"]].to_dict(orient='records'))
onehot_topping = topping_enc.transform(icecream[["topping"]].to_dict(orient='records'))
In [136]:
import scipy as sp
f1 = sp.sparse.spdiags(icecream['weight'].values, 0, n, n) * onehot_flavor
phi = sp.sparse.hstack((f1, onehot_topping))
In [127]:
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(phi, icecream['price'])
yhat = reg.predict(phi)
In [130]:
np.round(reg.coef_,2)
Out[130]:
array([ 0.8 ,  0.5 ,  0.75,  0.5 ,  1.  ,  0.  ,  0.3 ])
In [141]:
q = yhat - icecream['price']
Out[141]:
-0.043452525523396179
In [143]:
import plotly.figure_factory as ff

py.iplot(ff.create_distplot([yhat - icecream['price']], group_labels=["residuals"], bin_size=0.001))
In [4]:
onehot_flavor * icecream['mass'].values[:, np.newaxis]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-4f236c9fa5ad> in <module>()
----> 1 onehot_flavor * icecream['mass'].values[:, np.newaxis]

NameError: name 'onehot_flavor' is not defined
In [51]:
len(icecream['weight'].values)
Out[51]:
500
In [59]:
import scipy.sparse
In [62]:
scipy.sparse.spdiags(icecream['weight'].values, 0, n, n)  * onehot_flavor
    
    
Out[62]:
<500x3 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>